Loads the airway package

## ----loadairway------------------------------------------------------------
library("airway")
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum

Sets the variable “dir” to the pathway where the airway package is locally stored

## ----dir-------------------------------------------------------------------
dir <- system.file("extdata", package="airway", mustWork=TRUE)

Lists all of the files and directories in the dir and then specifies for just the quant directories

## ----list.files------------------------------------------------------------
list.files(dir)
##  [1] "GSE52778_series_matrix.txt"        "Homo_sapiens.GRCh37.75_subset.gtf"
##  [3] "quants"                            "sample_table.csv"                 
##  [5] "SraRunInfo_SRP033351.csv"          "SRR1039508_subset.bam"            
##  [7] "SRR1039509_subset.bam"             "SRR1039512_subset.bam"            
##  [9] "SRR1039513_subset.bam"             "SRR1039516_subset.bam"            
## [11] "SRR1039517_subset.bam"             "SRR1039520_subset.bam"            
## [13] "SRR1039521_subset.bam"
list.files(file.path(dir, "quants"))
## [1] "SRR1039508" "SRR1039509"

A comma separated values table is assigned to the variable “csvfile” and the first row of the file is extracted and shown.

## ----sampleinfo------------------------------------------------------------
csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata
##            SampleName    cell   dex albut        Run avgLength Experiment
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
##               Sample    BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677

A coldata table is created with two columns, names and files, and is populated with the paths to the quant files from the previous section.

## ----makecoldata-----------------------------------------------------------
coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
file.exists(coldata$files)
## [1] TRUE TRUE

The tximeta package is loaded and run on the coldata table

## ----tximeta, message=TRUE-------------------------------------------------
library("tximeta")
se <- tximeta(coldata)
## importing quantifications
## reading in files with read_tsv
## 1 2 
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## loading existing TxDb created: 2020-03-04 20:36:23
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## loading existing transcript ranges created: 2020-03-04 20:37:48
## fetching genome info for GENCODE

“se” is a transcript object that was loaded in the tximeta package. The “dim” call checks the dimensions and the “head” call displays the first lines of the object.

## ----lookse----------------------------------------------------------------
dim(se)
## [1] 205870      2
head(rownames(se))
## [1] "ENST00000456328.2" "ENST00000450305.2" "ENST00000488147.1"
## [4] "ENST00000619216.1" "ENST00000473358.1" "ENST00000469289.1"

The summarizeToGene call replicates then transforms the transcript object “se” to “gse”

## ----summarize, message=TRUE-----------------------------------------------
gse <- summarizeToGene(se)
## loading existing TxDb created: 2020-03-04 20:36:23
## obtaining transcript-to-gene mapping from TxDb
## loading existing gene ranges created: 2020-03-04 20:38:46
## summarizing abundance
## summarizing counts
## summarizing length

The “dim” call checks the dimensions and the “head” call displays the first lines of the gene object.

## ----lookgse---------------------------------------------------------------
dim(gse)
## [1] 58294     2
head(rownames(gse))
## [1] "ENSG00000000003.14" "ENSG00000000005.5"  "ENSG00000000419.12"
## [4] "ENSG00000000457.13" "ENSG00000000460.16" "ENSG00000000938.12"

A graph is created with a plotting method to visualize the intersection of the colData object and the range of rows.

## ----sumexp, echo=FALSE----------------------------------------------------
par(mar=c(0,0,0,0))
plot(1,1,xlim=c(0,100),ylim=c(0,100),bty="n",
     type="n",xlab="",ylab="",xaxt="n",yaxt="n")
polygon(c(45,90,90,45),c(5,5,70,70),col="pink",border=NA)
polygon(c(45,90,90,45),c(68,68,70,70),col="pink3",border=NA)
text(67.5,40,"assay(s)")
text(67.5,35,'e.g. "counts", ...')
polygon(c(10,40,40,10),c(5,5,70,70),col="skyblue",border=NA)
polygon(c(10,40,40,10),c(68,68,70,70),col="skyblue3",border=NA)
text(25,40,"rowRanges")
polygon(c(45,90,90,45),c(75,75,95,95),col="palegreen",border=NA)
polygon(c(45,47,47,45),c(75,75,95,95),col="palegreen3",border=NA)
text(67.5,85,"colData")

The data call loads the full gse object.

## ----loadfullgse-----------------------------------------------------------
data(gse)
gse
## class: RangedSummarizedExperiment 
## dim: 58294 8 
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(58294): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000285993.1 ENSG00000285994.1
## rowData names(1): gene_id
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(3): names donor condition

The “assayNames” call displays the titles of each column, while “head” shows the first few lines of the object and the “colSums” shows the sum of each column in the object.

## ----assaysgse-------------------------------------------------------------
assayNames(gse)
## [1] "counts"    "abundance" "length"
head(assay(gse), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14    708.164    467.962    900.992    424.368   1188.295
## ENSG00000000005.5       0.000      0.000      0.000      0.000      0.000
## ENSG00000000419.12    455.000    510.000    604.000    352.000    583.000
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   1090.668    805.929    599.337
## ENSG00000000005.5       0.000      0.000      0.000
## ENSG00000000419.12    773.999    409.999    499.000
colSums(assay(gse))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
##   21100805   19298584   26145537   15688246   25268618   31891456   19683767 
## SRR1039521 
##   21813903

Shows the ranges for the first and final five genes.

## ----rowrangesgse----------------------------------------------------------
rowRanges(gse)
## GRanges object with 58294 ranges and 1 metadata column:
##                      seqnames              ranges strand |            gene_id
##                         <Rle>           <IRanges>  <Rle> |        <character>
##   ENSG00000000003.14     chrX 100627109-100639991      - | ENSG00000000003.14
##    ENSG00000000005.5     chrX 100584802-100599885      + |  ENSG00000000005.5
##   ENSG00000000419.12    chr20   50934867-50958555      - | ENSG00000000419.12
##   ENSG00000000457.13     chr1 169849631-169894267      - | ENSG00000000457.13
##   ENSG00000000460.16     chr1 169662007-169854080      + | ENSG00000000460.16
##                  ...      ...                 ...    ... .                ...
##    ENSG00000285990.1    chr14   19244904-19269380      - |  ENSG00000285990.1
##    ENSG00000285991.1     chr6 149817937-149896011      - |  ENSG00000285991.1
##    ENSG00000285992.1     chr8   47129262-47132628      + |  ENSG00000285992.1
##    ENSG00000285993.1    chr18   46409197-46410645      - |  ENSG00000285993.1
##    ENSG00000285994.1    chr10   12563151-12567351      + |  ENSG00000285994.1
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome

Shows the information about the “gse” dataframe.

## ----coldatagse------------------------------------------------------------
colData(gse)
## DataFrame with 8 rows and 3 columns
##                 names    donor     condition
##              <factor> <factor>      <factor>
## SRR1039508 SRR1039508   N61311     Untreated
## SRR1039509 SRR1039509   N61311 Dexamethasone
## SRR1039512 SRR1039512  N052611     Untreated
## SRR1039513 SRR1039513  N052611 Dexamethasone
## SRR1039516 SRR1039516  N080611     Untreated
## SRR1039517 SRR1039517  N080611 Dexamethasone
## SRR1039520 SRR1039520  N061011     Untreated
## SRR1039521 SRR1039521  N061011 Dexamethasone

The “gse\(donor" and "gse\)condition” displays each of the parameters in the object.

## ----gsevars---------------------------------------------------------------
gse$donor
## [1] N61311  N61311  N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311
gse$condition
## [1] Untreated     Dexamethasone Untreated     Dexamethasone Untreated    
## [6] Dexamethasone Untreated     Dexamethasone
## Levels: Untreated Dexamethasone

Each of these renames these parameters.

## ----gsevarsrename---------------------------------------------------------
gse$cell <- gse$donor
gse$dex <- gse$condition

The “levels(gse$dex) <- c(”untrt“,”trt“)” call casts the names of the parameters to the new vector.

## ----renamelevels----------------------------------------------------------
levels(gse$dex)
## [1] "Untreated"     "Dexamethasone"
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")

This loads the “magrittr” package and executes a pipe with the relevel function.

## ----gsedex----------------------------------------------------------------
library("magrittr")
gse$dex %<>% relevel("untrt")
gse$dex
## [1] untrt trt   untrt trt   untrt trt   untrt trt  
## Levels: untrt trt

This line is not executed, but is the equilvalent to the previous line of code.

## ----explaincmpass, eval = FALSE-------------------------------------------
#  gse$dex <- relevel(gse$dex, "untrt")

The “round” call displays the fragments that can be mapped to the genes.

## ----countreads------------------------------------------------------------
round( colSums(assay(gse)) / 1e6, 1 )
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
##       21.1       19.3       26.1       15.7       25.3       31.9       19.7 
## SRR1039521 
##       21.8

The "DESeq2 package is loaded.

## ----loaddeseq2------------------------------------------------------------
library("DESeq2")

This call creates a fully annotated dds object for the gene analysis.

## ----makedds---------------------------------------------------------------
dds <- DESeqDataSet(gse, design = ~ cell + dex)
## using counts and average transcript lengths from tximeta

This creates a countdata object that creates an assay with the counts from the gse object. The “head” call displays the first lines of the countdata object.

## --------------------------------------------------------------------------
countdata <- round(assays(gse)[["counts"]])
head(countdata, 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14        708        468        901        424       1188
## ENSG00000000005.5           0          0          0          0          0
## ENSG00000000419.12        455        510        604        352        583
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14       1091        806        599
## ENSG00000000005.5           0          0          0
## ENSG00000000419.12        774        410        499

The coldata object is recast to hold the data from the gse object.

## --------------------------------------------------------------------------
coldata <- colData(gse)

An object, “ddsMat”, is created to hold the colData and countData objects.

## --------------------------------------------------------------------------
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
                                 colData = coldata,
                                 design = ~ cell + dex)
## converting counts to integer mode

This block is used to filter out the rows that contain one or fewer counts to isolate the data rows that have useful data.

## --------------------------------------------------------------------------
nrow(dds)
## [1] 58294
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
nrow(dds)
## [1] 31604

This goes to continue filtering the dataset to isolate rows with a higher data density.

## --------------------------------------------------------------------------
# at least 3 samples with a count of 10 or higher
keep <- rowSums(counts(dds) >= 10) >= 3

The output of this graph is used to show how the variance in the data grwos as the average does.

## ----meanSdCts-------------------------------------------------------------
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
meanSdPlot(cts, ranks = FALSE)

This graph is used to show how the log transformed graph has the highest varaince at a significantly lower average.

## ----meanSdLogCts----------------------------------------------------------
log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)

This shows the raw values used to create the first graph.

## ----vst-------------------------------------------------------------------
vsd <- vst(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(vsd), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14  10.105781   9.852029  10.169726   9.991545  10.424865
## ENSG00000000419.12   9.692244   9.923647   9.801921   9.798653   9.763455
## ENSG00000000457.13   9.449592   9.312186   9.362754   9.459168   9.281415
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14  10.194490  10.315814  10.002177
## ENSG00000000419.12   9.874703   9.683211   9.845507
## ENSG00000000457.13   9.395937   9.477971   9.477027
colData(vsd)
## DataFrame with 8 rows and 5 columns
##                 names    donor     condition     cell      dex
##              <factor> <factor>      <factor> <factor> <factor>
## SRR1039508 SRR1039508   N61311     Untreated   N61311    untrt
## SRR1039509 SRR1039509   N61311 Dexamethasone   N61311      trt
## SRR1039512 SRR1039512  N052611     Untreated  N052611    untrt
## SRR1039513 SRR1039513  N052611 Dexamethasone  N052611      trt
## SRR1039516 SRR1039516  N080611     Untreated  N080611    untrt
## SRR1039517 SRR1039517  N080611 Dexamethasone  N080611      trt
## SRR1039520 SRR1039520  N061011     Untreated  N061011    untrt
## SRR1039521 SRR1039521  N061011 Dexamethasone  N061011      trt

This shows the raw values used to create the second graph.

## ----rlog------------------------------------------------------------------
rld <- rlog(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(rld), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14   9.482613   9.172197   9.558383   9.346001   9.851349
## ENSG00000000419.12   8.860186   9.150196   9.000042   8.995902   8.951327
## ENSG00000000457.13   8.354790   8.166700   8.236582   8.366693   8.121781
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   9.587602   9.727248   9.357876
## ENSG00000000419.12   9.091075   8.848782   9.054384
## ENSG00000000457.13   8.282307   8.392384   8.391023

The packages “dplyr” and “ggplot2” are loaded and the samples are plotted against eachother to show the differences.

## ----transformplot, fig.width = 6, fig.height = 2.5------------------------
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2")

dds <- estimateSizeFactors(dds)
## using 'avgTxLength' from assays(dds), correcting for library size
df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
         mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
colnames(df)[1:2] <- c("x", "y")  

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid( . ~ transformation) 

This is used to determine the overall similarity between the samples.

## --------------------------------------------------------------------------
sampleDists <- dist(t(assay(vsd)))
sampleDists
##            SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## SRR1039509   39.42362                                                       
## SRR1039512   32.37620   44.93748                                            
## SRR1039513   51.09677   37.18799   41.79886                                 
## SRR1039516   35.59642   47.54671   34.83458   52.05265                      
## SRR1039517   51.26314   41.58572   46.89609   40.67315   39.74268           
## SRR1039520   32.38578   46.96000   30.35980   48.08669   37.07106   50.38349
## SRR1039521   51.49108   37.57383   47.17283   31.45899   52.62276   41.35941
##            SRR1039520
## SRR1039509           
## SRR1039512           
## SRR1039513           
## SRR1039516           
## SRR1039517           
## SRR1039520           
## SRR1039521   43.01502

The packages “pheatmap” and “RColorBrewer” are loaded.

## --------------------------------------------------------------------------
library("pheatmap")
library("RColorBrewer")

This creates a heatmap to show the sample distance matrix.

## ----distheatmap, fig.width = 6.1, fig.height = 4.5------------------------
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)

The “PoiClaClu” package is loaded and used to do Poisson Distance analysis.

## --------------------------------------------------------------------------
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))

The Poisson Distance analysis is used to create a heatmap for the sample distances. The Poisson distance matrix uses the raw data values instead of the normalized ones.

## ----poisdistheatmap, fig.width = 6.1, fig.height = 4.5--------------------
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
         clustering_distance_rows = poisd$dd,
         clustering_distance_cols = poisd$dd,
         col = colors)

A Principle Components Analysis plot is created with the to show variance.

## ----plotpca, fig.width=6, fig.height=4.5----------------------------------
plotPCA(vsd, intgroup = c("dex", "cell"))

This uses the ggplot2 package to begin working on the PCA plot from the vsd data.

## --------------------------------------------------------------------------
pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
##                   PC1        PC2         group   dex    cell       name
## SRR1039508 -14.311369 -2.6000421  untrt:N61311 untrt  N61311 SRR1039508
## SRR1039509   8.058574 -0.7500532    trt:N61311   trt  N61311 SRR1039509
## SRR1039512  -9.404122 -4.3920761 untrt:N052611 untrt N052611 SRR1039512
## SRR1039513  14.497842 -4.1323833   trt:N052611   trt N052611 SRR1039513
## SRR1039516 -12.365055 11.2109581 untrt:N080611 untrt N080611 SRR1039516
## SRR1039517   9.343946 14.9115160   trt:N080611   trt N080611 SRR1039517
## SRR1039520 -10.852633 -7.7618618 untrt:N061011 untrt N061011 SRR1039520
## SRR1039521  15.032816 -6.4860576   trt:N061011   trt N061011 SRR1039521
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot2 is used to create the PCA plot with significantly more control voer the details.

## ----ggplotpca, fig.width=6, fig.height=4.5--------------------------------
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
  geom_point(size =3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed() +
  ggtitle("PCA with VST data")

The “glmpca” package is loaded and creates a generalized PCA plot.

## --------------------------------------------------------------------------
library("glmpca")
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$dex <- dds$dex
gpca.dat$cell <- dds$cell

## ----glmpca, fig.width=6, fig.height=4.5-----------------------------------
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = dex, shape = cell)) +
  geom_point(size =3) + coord_fixed() + ggtitle("glmpca - Generalized PCA")

An MDS (Multidimensional Scaling) plot is best used with the matrixes of distances instead of the matrixes of data.

## ----mdsvst, fig.width=6, fig.height=4.5-----------------------------------
mds <- as.data.frame(colData(vsd))  %>%
         cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed() + ggtitle("MDS with VST data")

A similar MDS plot can be created with the Poisson Distances.

## ----mdspois, fig.width=6, fig.height=4.5----------------------------------
mdsPois <- as.data.frame(colData(dds)) %>%
   cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed() + ggtitle("MDS with PoissonDistances")

This runs a differential expression pipeline on the raw counts.

## ----airwayDE--------------------------------------------------------------
dds <- DESeq(dds)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

The results of the differential expression pipeline are shown here.

## --------------------------------------------------------------------------
res <- results(dds)
res
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 31604 rows and 6 columns
##                             baseMean      log2FoldChange             lfcSE
##                            <numeric>           <numeric>         <numeric>
## ENSG00000000003.14  739.940717372053  -0.361153676816704 0.106868570706213
## ENSG00000000419.12   511.73572199559   0.206314742225697 0.128664534265382
## ENSG00000000457.13  314.194855097564   0.037830765702789 0.158633410116225
## ENSG00000000460.16  79.7936215457165  -0.115258993162199 0.314990674190846
## ENSG00000000938.12 0.307266835513683   -1.36911852155525  3.50376358863452
## ...                              ...                 ...               ...
## ENSG00000285979.1   38.3538864147071   0.342365713888024 0.359510635836304
## ENSG00000285987.1   1.56250793343253   0.706414476454889   1.5472945399791
## ENSG00000285990.1  0.642314540619725   0.364733264633251  3.43327565299413
## ENSG00000285991.1   11.2762841129775  -0.116551517129412 0.748601160880117
## ENSG00000285994.1   3.65104108073599 -0.0960094332693204   1.0686598396981
##                                  stat               pvalue                padj
##                             <numeric>            <numeric>           <numeric>
## ENSG00000000003.14  -3.37941898567666 0.000726392125305196 0.00531136916201091
## ENSG00000000419.12   1.60350902759384    0.108822317644649   0.293188699547079
## ENSG00000000457.13  0.238479180867837    0.811509460868018    0.92255697455246
## ENSG00000000460.16 -0.365912398702847    0.714430444201867   0.872980377910416
## ENSG00000000938.12 -0.390756535628255    0.695977205170472                  NA
## ...                               ...                  ...                 ...
## ENSG00000285979.1   0.952310390182487    0.340939589953335   0.600750489158651
## ENSG00000285987.1   0.456548160807465    0.647995846834697                  NA
## ENSG00000285990.1   0.106234774453712    0.915396081024322                  NA
## ENSG00000285991.1  -0.155692407679924    0.876275481983982   0.952921308734717
## ENSG00000285994.1  -0.089840966884695    0.928413592935384                  NA

The more controlled results could also have been achieved by directly specifying the results parameters.

## --------------------------------------------------------------------------
res <- results(dds, contrast=c("dex","trt","untrt"))

The “res” dataframe object metadata is shown here.

## --------------------------------------------------------------------------
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
##                        type                               description
##                 <character>                               <character>
## baseMean       intermediate mean of normalized counts for all samples
## log2FoldChange      results  log2 fold change (MLE): dex trt vs untrt
## lfcSE               results          standard error: dex trt vs untrt
## stat                results          Wald statistic: dex trt vs untrt
## pvalue              results       Wald test p-value: dex trt vs untrt
## padj                results                      BH adjusted p-values

A summary of the “res” object is shown here.

## --------------------------------------------------------------------------
summary(res)
## 
## out of 31604 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2373, 7.5%
## LFC < 0 (down)     : 1949, 6.2%
## outliers [1]       : 0, 0%
## low counts [2]     : 14706, 47%
## (mean count < 9)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

The false discovery rate threshold is lowered and the results function is updated to optimize filtration.

## --------------------------------------------------------------------------
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
## 
## FALSE  TRUE 
## 13357  3541

This selects for genes that have significant changes by more than a factor of 2 (either 0.5 or 2).

## --------------------------------------------------------------------------
resLFC1 <- results(dds, lfcThreshold=1)
table(resLFC1$padj < 0.1)
## 
## FALSE  TRUE 
## 16687   211

Here the results of the comparison between two levels are shown.

## --------------------------------------------------------------------------
results(dds, contrast = c("cell", "N061011", "N61311"))
## log2 fold change (MLE): cell N061011 vs N61311 
## Wald test p-value: cell N061011 vs N61311 
## DataFrame with 31604 rows and 6 columns
##                             baseMean      log2FoldChange             lfcSE
##                            <numeric>           <numeric>         <numeric>
## ENSG00000000003.14  739.940717372053   0.270945073288714 0.152170652747696
## ENSG00000000419.12   511.73572199559 -0.0718309515716049 0.182816709211393
## ENSG00000000457.13  314.194855097564    0.17988057135378 0.225122028264875
## ENSG00000000460.16  79.7936215457165  -0.119482058987533 0.441593831219506
## ENSG00000000938.12 0.307266835513683                   0   4.9975798104491
## ...                              ...                 ...               ...
## ENSG00000285979.1   38.3538864147071  0.0589757126275811 0.512391427449622
## ENSG00000285987.1   1.56250793343253    1.02168042060967  2.20186060481305
## ENSG00000285990.1  0.642314540619725   -3.09564035666317  4.85271500898552
## ENSG00000285991.1   11.2762841129775  -0.877962796217153  1.04696318967239
## ENSG00000285994.1   3.65104108073599 -0.0192350857348909   1.5132364065081
##                                   stat             pvalue              padj
##                              <numeric>          <numeric>         <numeric>
## ENSG00000000003.14    1.78053434349099 0.0749885539964457  0.37882768043907
## ENSG00000000419.12  -0.392912397786058  0.694384184390243  0.93670340928549
## ENSG00000000457.13   0.799035850646014  0.424269624416598 0.820732883668396
## ENSG00000000460.16  -0.270570036401033  0.786721743935893 0.960661528779669
## ENSG00000000938.12                   0                  1                NA
## ...                                ...                ...               ...
## ENSG00000285979.1    0.115098944806955  0.908366696268635 0.983709901493367
## ENSG00000285987.1     0.46400776614849  0.642642181542788                NA
## ENSG00000285990.1   -0.637919257762125  0.523526241080306                NA
## ENSG00000285991.1   -0.838580386471737   0.40170482075606                NA
## ENSG00000285994.1  -0.0127112232114989  0.989858184362336                NA

Multiple Testing methods are required to provide a P value that disproves the null.

## ----sumres----------------------------------------------------------------
sum(res$pvalue < 0.05, na.rm=TRUE)
## [1] 5170
sum(!is.na(res$pvalue))
## [1] 31604

Here we are looking for the number of genes with a less than 10% false positive rate.

## --------------------------------------------------------------------------
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 4322

The genes are then arranged by strongest downregulation.

## --------------------------------------------------------------------------
resSig <- subset(res, padj < 0.1)
head(resSig[ order(resSig$log2FoldChange), ])
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 6 columns
##                           baseMean    log2FoldChange             lfcSE
##                          <numeric>         <numeric>         <numeric>
## ENSG00000216490.3 42.3007106248874 -5.72482604043858  1.47565155349927
## ENSG00000267339.5 30.5206063861615 -5.39781111590036 0.773017198758288
## ENSG00000257542.5 10.0398869404764 -5.25991351125893  1.28200100388257
## ENSG00000146006.7 61.6448430808005 -4.49504194549369 0.663821044679586
## ENSG00000108700.4 14.6323552674909 -4.09068745839218 0.941842256424086
## ENSG00000213240.8  12.096158074824 -3.87312575409988  1.27413298748988
##                                stat               pvalue                 padj
##                           <numeric>            <numeric>            <numeric>
## ENSG00000216490.3 -3.87952428665362 0.000104660951388532 0.000987852735620848
## ENSG00000267339.5 -6.98278269173179 2.89389866967638e-12  9.4586266383349e-11
## ENSG00000257542.5 -4.10289344183752 4.08015195589968e-05 0.000430645894758231
## ENSG00000146006.7 -6.77146646904417 1.27483509915645e-11 3.82631678606496e-10
## ENSG00000108700.4 -4.34328299722226 1.40369125072207e-05  0.00016668710298455
## ENSG00000213240.8 -3.03981279201488  0.00236725244353312   0.0144986704569854

They can also be arranged by the strongest upregulation.

## --------------------------------------------------------------------------
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 6 columns
##                            baseMean   log2FoldChange             lfcSE
##                           <numeric>        <numeric>         <numeric>
## ENSG00000254692.1  62.2302142486739 10.2071429096331  3.37705826771585
## ENSG00000179593.15 67.0895286534166 9.50514812098004  1.07705493413338
## ENSG00000268173.3  46.4370434386407 8.40438119076897  3.38505617991746
## ENSG00000224712.12 35.5607475035812 7.16685536619654  2.16475512121352
## ENSG00000109906.13 438.193981971508 6.37749569519001 0.313810468763182
## ENSG00000257663.1  24.3946124843467 6.34758399422488  2.09531382411201
##                                stat               pvalue                 padj
##                           <numeric>            <numeric>            <numeric>
## ENSG00000254692.1   3.0224953496396  0.00250699934752643   0.0152166935971629
## ENSG00000179593.15 8.82512843100996 1.09334265101889e-18 6.81745539369636e-17
## ENSG00000268173.3  2.48278927854424   0.0130358175653821   0.0594385443118797
## ENSG00000224712.12  3.3107002708828 0.000930628316571692  0.00655239887226186
## ENSG00000109906.13 20.3227627182916 8.08933590301556e-92 1.24266907353779e-88
## ENSG00000257663.1  3.02941923122899  0.00245024410719956   0.0149150666150786

The plotCounts method shows counts for the groupings of the genes.

## ----plotcounts------------------------------------------------------------
topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup=c("dex"))

The “ggbeeswarm” package is loaded and used along with ggplot2 to create a custom plotCount plot.

## ----ggplotcountsjitter, fig.width = 4, fig.height = 3---------------------
library("ggbeeswarm")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex","cell"),
                         returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
  scale_y_log10() +  geom_beeswarm(cex = 3)

The normalized counts have connected lines to indicate cell lines impact.

## ----ggplotcountsgroup, fig.width = 4, fig.height = 3----------------------
ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
  scale_y_log10() + geom_point(size = 3) + geom_line()

The “apeglm” packge is loaded and a Minus Average plot is created.

## ----plotma----------------------------------------------------------------
library("apeglm")
resultsNames(dds)
## [1] "Intercept"               "cell_N061011_vs_N052611"
## [3] "cell_N080611_vs_N052611" "cell_N61311_vs_N052611" 
## [5] "dex_trt_vs_untrt"
res <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(res, ylim = c(-5, 5))

This is the plot that lacks the log fold change reduction.

## ----plotmaNoShr-----------------------------------------------------------
res.noshr <- results(dds, name="dex_trt_vs_untrt")
plotMA(res.noshr, ylim = c(-5, 5))

This shows how to select specific rows or regions of the graph.

## ----plotmalabel-----------------------------------------------------------
plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
  points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
  text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})

The histogram shows the p value frequencies, and is most effective when smallest values are excluded.

## ----histpvalue2-----------------------------------------------------------
hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
     col = "grey50", border = "white")

The “genefilter” package is loaded and used to show gene clustering.

## --------------------------------------------------------------------------
library("genefilter")
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
## 
##     rowSds, rowVars
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)

This heatmap shows deviation from the ave as opposed to expression strenght.

## ----genescluster----------------------------------------------------------
mat  <- assay(vsd)[ topVarGenes, ]
mat  <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("cell","dex")])
pheatmap(mat, annotation_col = anno)

New bins are created with the “quantile” function and the p values for each are calculated.

## ----sensitivityovermean, fig.width=6--------------------------------------
qs <- c(0, quantile(resLFC1$baseMean[resLFC1$baseMean > 0], 0:6/6))
bins <- cut(resLFC1$baseMean, qs)
levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)])/2, 2)))
fractionSig <- tapply(resLFC1$pvalue, bins, function(p)
                          mean(p < .05, na.rm = TRUE))
barplot(fractionSig, xlab = "mean normalized count",
                     ylab = "fraction of small p values")

## This line is not executed, but is similar to the previous block of code.

## ---- eval=FALSE-----------------------------------------------------------
#  library("IHW")
#  res.ihw <- results(dds, filterFun=ihw)

The “AnnotationDbi” and “org.Hs.eg.db” packages are loaded.

## --------------------------------------------------------------------------
library("AnnotationDbi")
library("org.Hs.eg.db")
## 

This is a list of the available key types from the AnnotationDbi database.

## --------------------------------------------------------------------------
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
## [26] "UNIPROT"

This is used to specify what information is desired from the database.

## --------------------------------------------------------------------------
ens.str <- substr(rownames(res), 1, 15)
res$symbol <- mapIds(org.Hs.eg.db,
                     keys=ens.str,
                     column="SYMBOL",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res$entrez <- mapIds(org.Hs.eg.db,
                     keys=ens.str,
                     column="ENTREZID",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns

These are the external gene IDs.

## --------------------------------------------------------------------------
resOrdered <- res[order(res$pvalue),]
head(resOrdered)
## log2 fold change (MAP): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 7 columns
##                            baseMean   log2FoldChange             lfcSE
##                           <numeric>        <numeric>         <numeric>
## ENSG00000189221.9  2373.80530666857 3.38765439100121 0.136984725340872
## ENSG00000120129.5  3420.72722668767 2.96334836319586 0.120849818517125
## ENSG00000101347.9  14125.5841210142 3.74129175571393 0.157926779740445
## ENSG00000196136.17 2710.21742603852 3.23518345105418 0.143950996111791
## ENSG00000152583.12 974.737366684742 4.48640613542985 0.201275651989011
## ENSG00000211445.11 12512.7915263227 3.75874563219025 0.169535830279915
##                                   pvalue                  padj      symbol
##                                <numeric>             <numeric> <character>
## ENSG00000189221.9  1.84493931066867e-137 3.11757844716792e-133        MAOA
## ENSG00000120129.5   6.3504237592554e-135 5.36547303419488e-131       DUSP1
## ENSG00000101347.9  2.88983300388474e-127 1.62774660332148e-123      SAMHD1
## ENSG00000196136.17 3.68488166768093e-114 1.55667826051181e-110    SERPINA3
## ENSG00000152583.12 2.94551443785581e-113  9.9546605941775e-110     SPARCL1
## ENSG00000211445.11 2.36246306534271e-112 6.65348347969353e-109        GPX3
##                         entrez
##                    <character>
## ENSG00000189221.9         4128
## ENSG00000120129.5         1843
## ENSG00000101347.9        25939
## ENSG00000196136.17          12
## ENSG00000152583.12        8404
## ENSG00000211445.11        2878

This code is not evaluated.

## ----eval=FALSE------------------------------------------------------------
#  resOrderedDF <- as.data.frame(resOrdered)[1:100, ]
#  write.csv(resOrderedDF, file = "results.csv")

## ----eval=FALSE------------------------------------------------------------
#  library("ReportingTools")
#  htmlRep <- HTMLReport(shortName="report", title="My report",
#                        reportDirectory="./report")
#  publish(resOrderedDF, htmlRep)
#  url <- finish(htmlRep)
#  browseURL(url)

This plots the differential expression results in the genomic space.

## --------------------------------------------------------------------------
resGR <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm", format="GRanges")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resGR
## GRanges object with 31604 ranges and 5 metadata columns:
##                      seqnames              ranges strand |          baseMean
##                         <Rle>           <IRanges>  <Rle> |         <numeric>
##   ENSG00000000003.14     chrX 100627109-100639991      - |  739.940717372053
##   ENSG00000000419.12    chr20   50934867-50958555      - |   511.73572199559
##   ENSG00000000457.13     chr1 169849631-169894267      - |  314.194855097564
##   ENSG00000000460.16     chr1 169662007-169854080      + |  79.7936215457165
##   ENSG00000000938.12     chr1   27612064-27635277      - | 0.307266835513683
##                  ...      ...                 ...    ... .               ...
##    ENSG00000285979.1    chr16   57177349-57181390      + |  38.3538864147071
##    ENSG00000285987.1     chr9   84316514-84657077      + |  1.56250793343253
##    ENSG00000285990.1    chr14   19244904-19269380      - | 0.642314540619725
##    ENSG00000285991.1     chr6 149817937-149896011      - |  11.2762841129775
##    ENSG00000285994.1    chr10   12563151-12567351      + |  3.65104108073599
##                             log2FoldChange             lfcSE
##                                  <numeric>         <numeric>
##   ENSG00000000003.14    -0.336004620197314 0.105908878209215
##   ENSG00000000419.12     0.178378944093913 0.122440452558955
##   ENSG00000000457.13    0.0299360694683492 0.141095483475456
##   ENSG00000000460.16   -0.0555061349569227 0.222786670799123
##   ENSG00000000938.12   -0.0115799289067039  0.30473963728621
##                  ...                   ...               ...
##    ENSG00000285979.1      0.15284386124239 0.257069545767735
##    ENSG00000285987.1    0.0255152677225242 0.300687312203615
##    ENSG00000285990.1 -0.000185630381439551 0.304464769734699
##    ENSG00000285991.1   -0.0150788219861866 0.283931479729372
##    ENSG00000285994.1  -0.00684681225244801 0.293399449125489
##                                    pvalue                padj
##                                 <numeric>           <numeric>
##   ENSG00000000003.14 0.000726392125305196 0.00531136916201091
##   ENSG00000000419.12    0.108822317644649   0.293188699547079
##   ENSG00000000457.13    0.811509460868018    0.92255697455246
##   ENSG00000000460.16    0.714430444201867   0.872980377910416
##   ENSG00000000938.12    0.695977205170472                <NA>
##                  ...                  ...                 ...
##    ENSG00000285979.1    0.340939589953335   0.600750489158651
##    ENSG00000285987.1    0.647995846834697                <NA>
##    ENSG00000285990.1    0.915396081024322                <NA>
##    ENSG00000285991.1    0.876275481983982   0.952921308734717
##    ENSG00000285994.1    0.928413592935384                <NA>
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome

Symbols are added to include gene labels on the plot

## --------------------------------------------------------------------------
ens.str <- substr(names(resGR), 1, 15)
resGR$symbol <- mapIds(org.Hs.eg.db, ens.str, "SYMBOL", "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns

The “Gviz” package is loaded.

## --------------------------------------------------------------------------
library("Gviz")
## Loading required package: grid

A million base pairs window on either side of the gene with the smallest p value is set to collect a subset of genes within that window.

## --------------------------------------------------------------------------
window <- resGR[topGene] + 1e6
strand(window) <- "*"
resGRsub <- resGR[resGR %over% window]
naOrDup <- is.na(resGRsub$symbol) | duplicated(resGRsub$symbol)
resGRsub$group <- ifelse(naOrDup, names(resGRsub), resGRsub$symbol)

A vector identifying if the “padj” value of the gene was low.

## --------------------------------------------------------------------------
status <- factor(ifelse(resGRsub$padj < 0.05 & !is.na(resGRsub$padj),
                        "sig", "notsig"))

A plot showing the locations of each gene on is created with functions from the “Gviz” package.

## ----gvizplot--------------------------------------------------------------
options(ucscChromosomeNames = FALSE)
g <- GenomeAxisTrack()
a <- AnnotationTrack(resGRsub, name = "gene ranges", feature = status)
d <- DataTrack(resGRsub, data = "log2FoldChange", baseline = 0,
               type = "h", name = "log2 fold change", strand = "+")
plotTracks(list(g, d, a), groupAnnotation = "group",
           notsig = "grey", sig = "hotpink")

The “sva” package is loaded.

## --------------------------------------------------------------------------
library("sva")
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.

A matrix with the normalized counts is created for samples where the average count was greater than 1.

## --------------------------------------------------------------------------
dat  <- counts(dds, normalized = TRUE)
idx  <- rowMeans(dat) > 1
dat  <- dat[idx, ]
mod  <- model.matrix(~ dex, colData(dds))
mod0 <- model.matrix(~   1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
svseq$sv
##            [,1]        [,2]
## [1,]  0.2465669 -0.51599084
## [2,]  0.2588137 -0.59462876
## [3,]  0.1384516  0.24920662
## [4,]  0.2179075  0.37716083
## [5,] -0.6042910 -0.06305844
## [6,] -0.6138795 -0.03623320
## [7,]  0.1821306  0.30328185
## [8,]  0.1743002  0.28026195

This is used to check how well the “SVA” method recovered the sources of variation.

## ----svaplot---------------------------------------------------------------
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
  stripchart(svseq$sv[, i] ~ dds$cell, vertical = TRUE, main = paste0("SV", i))
  abline(h = 0)
 }

This is used to prevent the “SVA” method from impacting the counts.

## --------------------------------------------------------------------------
ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + dex

The “RUVSeq” package is loaded.

## --------------------------------------------------------------------------
library("RUVSeq")
## Loading required package: EDASeq
## Loading required package: ShortRead
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
## Loading required package: GenomicAlignments
## 
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
## 
##     last
## 
## Attaching package: 'ShortRead'
## The following objects are masked from 'package:Gviz':
## 
##     chromosome, position
## The following object is masked from 'package:dplyr':
## 
##     id
## The following object is masked from 'package:magrittr':
## 
##     functions
## Loading required package: edgeR
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

Usng the “RUVg” method, it’s possible to estimate the factors of unwanted variation, which is similar to the “SVA” surrogate variables.

## --------------------------------------------------------------------------
set <- newSeqExpressionSet(counts(dds))
idx  <- rowSums(counts(set) > 5) >= 2
set  <- set[idx, ]
set <- betweenLaneNormalization(set, which="upper")
not.sig <- rownames(res)[which(res$pvalue > .1)]
empirical <- rownames(set)[ rownames(set) %in% not.sig ]
set <- RUVg(set, empirical, k=2)
pData(set)
##                     W_1         W_2
## SRR1039508 -0.224881168  0.42992983
## SRR1039509 -0.249022928  0.53858506
## SRR1039512  0.001460949  0.01437385
## SRR1039513 -0.175547525  0.08408354
## SRR1039516  0.599387535 -0.02512358
## SRR1039517  0.590516825 -0.02549392
## SRR1039520 -0.241071948 -0.50369551
## SRR1039521 -0.300841739 -0.51265927

This shows how the factors can be estimated by “RUV”.

## ----ruvplot---------------------------------------------------------------
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
  stripchart(pData(set)[, i] ~ dds$cell, vertical = TRUE, main = paste0("W", i))
  abline(h = 0)
 }

It is possible to control these factors by defining them.

## --------------------------------------------------------------------------
ddsruv <- dds
ddsruv$W1 <- set$W_1
ddsruv$W2 <- set$W_2
design(ddsruv) <- ~ W1 + W2 + dex

The “fission” package is loaded, and a basic time course analysis is run.

## --------------------------------------------------------------------------
library("fission")
data("fission")
ddsTC <- DESeqDataSet(fission, ~ strain + minute + strain:minute)

This is a likelihood ratio test where the strain specific differences are removed over time.

## ----fissionDE-------------------------------------------------------------
ddsTC <- DESeq(ddsTC, test="LRT", reduced = ~ strain + minute)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resTC <- results(ddsTC)
resTC$symbol <- mcols(ddsTC)$symbol
head(resTC[order(resTC$padj),], 4)
## log2 fold change (MLE): strainmut.minute180 
## LRT p-value: '~ strain + minute + strain:minute' vs '~ strain + minute' 
## DataFrame with 4 rows and 7 columns
##                      baseMean      log2FoldChange             lfcSE
##                     <numeric>           <numeric>         <numeric>
## SPBC2F12.09c 174.671161802578   -2.65671953167536 0.752261272510079
## SPAC1002.18  444.504950375698 -0.0509321395113243  0.20429948485572
## SPAC1002.19  336.373206596558  -0.392748982380861 0.573494009173123
## SPAC1002.17c 261.773132733438   -1.13876476912802  0.60612875682789
##                          stat               pvalue                 padj
##                     <numeric>            <numeric>            <numeric>
## SPBC2F12.09c  97.283386406583 1.97415107461335e-19 1.33452612643862e-15
## SPAC1002.18   56.953598622801 5.16955159226891e-11 1.74730843818689e-07
## SPAC1002.19  43.5339085770392 2.87980357353034e-08 6.48915738568838e-05
## SPAC1002.17c 39.3158355351511 2.05137078380187e-07 0.000346681662462517
##                   symbol
##              <character>
## SPBC2F12.09c       atf21
## SPAC1002.18         urg3
## SPAC1002.19         urg1
## SPAC1002.17c        urg2

ggplot2 is used to show the counts for each gorup over a time frame.

## ----fissioncounts, fig.width=6, fig.height=4.5----------------------------
fiss <- plotCounts(ddsTC, which.min(resTC$padj), 
                   intgroup = c("minute","strain"), returnData = TRUE)
fiss$minute <- as.numeric(as.character(fiss$minute))
ggplot(fiss,
  aes(x = minute, y = count, color = strain, group = strain)) + 
  geom_point() + stat_summary(fun.y=mean, geom="line") +
  scale_y_log10()

A “Wald” test for the log2 fold changes at individual time points.

## --------------------------------------------------------------------------
resultsNames(ddsTC)
##  [1] "Intercept"           "strain_mut_vs_wt"    "minute_15_vs_0"     
##  [4] "minute_30_vs_0"      "minute_60_vs_0"      "minute_120_vs_0"    
##  [7] "minute_180_vs_0"     "strainmut.minute15"  "strainmut.minute30" 
## [10] "strainmut.minute60"  "strainmut.minute120" "strainmut.minute180"
res30 <- results(ddsTC, name="strainmut.minute30", test="Wald")
res30[which.min(resTC$padj),]
## log2 fold change (MLE): strainmut.minute30 
## Wald test p-value: strainmut.minute30 
## DataFrame with 1 row and 6 columns
##                      baseMean    log2FoldChange             lfcSE
##                     <numeric>         <numeric>         <numeric>
## SPBC2F12.09c 174.671161802578 -2.60046902875453 0.634342916343005
##                           stat               pvalue              padj
##                      <numeric>            <numeric>         <numeric>
## SPBC2F12.09c -4.09946885471073 4.14099389595899e-05 0.279931187366828

Cluster significant genes can be shown by their profiles.

## --------------------------------------------------------------------------
betas <- coef(ddsTC)
colnames(betas)
##  [1] "Intercept"           "strain_mut_vs_wt"    "minute_15_vs_0"     
##  [4] "minute_30_vs_0"      "minute_60_vs_0"      "minute_120_vs_0"    
##  [7] "minute_180_vs_0"     "strainmut.minute15"  "strainmut.minute30" 
## [10] "strainmut.minute60"  "strainmut.minute120" "strainmut.minute180"

The log2 fold changes are plotted to a heatmap.

## ----fissionheatmap--------------------------------------------------------
topGenes <- head(order(resTC$padj),20)
mat <- betas[topGenes, -c(1,2)]
thr <- 3 
mat[mat < -thr] <- -thr
mat[mat > thr] <- thr
pheatmap(mat, breaks=seq(from=-thr, to=thr, length=101),
         cluster_col=FALSE)

The information about the analysis pipeline is shown.

## --------------------------------------------------------------------------
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] fission_1.6.0               RUVSeq_1.20.0              
##  [3] edgeR_3.28.1                limma_3.42.2               
##  [5] EDASeq_2.20.0               ShortRead_1.44.3           
##  [7] GenomicAlignments_1.22.1    Rsamtools_2.2.3            
##  [9] Biostrings_2.54.0           XVector_0.26.0             
## [11] sva_3.34.0                  mgcv_1.8-31                
## [13] nlme_3.1-144                Gviz_1.30.3                
## [15] org.Hs.eg.db_3.10.0         genefilter_1.68.0          
## [17] apeglm_1.8.0                ggbeeswarm_0.6.0           
## [19] glmpca_0.1.0                PoiClaClu_1.0.2.1          
## [21] RColorBrewer_1.1-2          pheatmap_1.0.12            
## [23] ggplot2_3.2.1               dplyr_0.8.4                
## [25] vsn_3.54.0                  DESeq2_1.26.0              
## [27] magrittr_1.5                GenomicFeatures_1.38.2     
## [29] AnnotationDbi_1.48.0        tximeta_1.4.3              
## [31] airway_1.6.0                SummarizedExperiment_1.16.1
## [33] DelayedArray_0.12.2         BiocParallel_1.20.1        
## [35] matrixStats_0.55.0          Biobase_2.46.0             
## [37] GenomicRanges_1.38.0        GenomeInfoDb_1.22.0        
## [39] IRanges_2.20.2              S4Vectors_0.24.3           
## [41] BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.5          Hmisc_4.3-1              aroma.light_3.16.0      
##   [4] BiocFileCache_1.10.2     plyr_1.8.6               lazyeval_0.2.2          
##   [7] splines_3.6.2            digest_0.6.25            ensembldb_2.10.2        
##  [10] htmltools_0.4.0          checkmate_2.0.0          memoise_1.1.0           
##  [13] BSgenome_1.54.0          cluster_2.1.0            readr_1.3.1             
##  [16] annotate_1.64.0          R.utils_2.9.2            askpass_1.1             
##  [19] bdsmatrix_1.3-4          prettyunits_1.1.1        jpeg_0.1-8.1            
##  [22] colorspace_1.4-1         blob_1.2.1               rappdirs_0.3.1          
##  [25] xfun_0.12                crayon_1.3.4             RCurl_1.98-1.1          
##  [28] jsonlite_1.6.1           tximport_1.14.0          hexbin_1.28.1           
##  [31] survival_3.1-8           VariantAnnotation_1.32.0 glue_1.3.1              
##  [34] gtable_0.3.0             zlibbioc_1.32.0          DESeq_1.38.0            
##  [37] scales_1.1.0             mvtnorm_1.1-0            DBI_1.1.0               
##  [40] Rcpp_1.0.3               xtable_1.8-4             progress_1.2.2          
##  [43] emdbook_1.3.12           htmlTable_1.13.3         foreign_0.8-76          
##  [46] bit_1.1-15.2             preprocessCore_1.48.0    Formula_1.2-3           
##  [49] htmlwidgets_1.5.1        httr_1.4.1               acepack_1.4.1           
##  [52] R.methodsS3_1.8.0        pkgconfig_2.0.3          XML_3.99-0.3            
##  [55] farver_2.0.3             nnet_7.3-13              dbplyr_1.4.2            
##  [58] locfit_1.5-9.1           tidyselect_1.0.0         labeling_0.3            
##  [61] rlang_0.4.5              reshape2_1.4.3           munsell_0.5.0           
##  [64] tools_3.6.2              RSQLite_2.2.0            evaluate_0.14           
##  [67] stringr_1.4.0            yaml_2.2.1               knitr_1.28              
##  [70] bit64_0.9-7              purrr_0.3.3              AnnotationFilter_1.10.0 
##  [73] R.oo_1.23.0              biomaRt_2.42.0           compiler_3.6.2          
##  [76] rstudioapi_0.11          beeswarm_0.2.3           curl_4.3                
##  [79] png_0.1-7                affyio_1.56.0            tibble_2.1.3            
##  [82] geneplotter_1.64.0       stringi_1.4.6            lattice_0.20-40         
##  [85] ProtGenerics_1.18.0      Matrix_1.2-18            vctrs_0.2.3             
##  [88] pillar_1.4.3             lifecycle_0.1.0          BiocManager_1.30.10     
##  [91] data.table_1.12.8        bitops_1.0-6             rtracklayer_1.46.0      
##  [94] hwriter_1.3.2            R6_2.4.1                 latticeExtra_0.6-29     
##  [97] affy_1.64.0              gridExtra_2.3            vipor_0.4.5             
## [100] dichromat_2.0-0          MASS_7.3-51.5            assertthat_0.2.1        
## [103] openssl_1.4.1            withr_2.1.2              GenomeInfoDbData_1.2.2  
## [106] hms_0.5.3                rpart_4.1-15             coda_0.19-3             
## [109] rmarkdown_2.1            biovizBase_1.34.1        bbmle_1.0.23.1          
## [112] numDeriv_2016.8-1.1      base64enc_0.1-3